# outage_duration. They have tracked three different outage durations: 0 for no outage,
# 1 for short outages that last anywhere between a few minutes and a maximum of 2 hours,
# and 2 for long outages that can last from 2 hours to sometimes even a couple of days.
# You will now use these metrics that the company has tracked to create a machine
# learning model that can predict outage_duration, so that the company can better
# handle outages, improve customer satisfaction, and therefore reduce customer churn.
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
#!pip install plotly
import plotly.offline as pyo
import plotly.graph_objs as go
from scipy.stats.mstats import mode
from sklearn.preprocessing import LabelEncoder
# Render matplotlib figures inline in the notebook (IPython magic).
%matplotlib inline
from matplotlib.pylab import rcParams
# Default figure size (width, height in inches) for all subsequent plots.
rcParams['figure.figsize'] = 12, 4
# Load every raw CSV extract once, keeping the global names that the rest of
# the notebook refers to. Read order matches the original cell.
_csv_names = ('train_data', 'test_data', 'server_data',
              'report_data', 'broadband_data', 'outage_data')
train_data, test_data, server_data, report_data, broadband_data, outage_data = (
    pd.read_csv(f'{_name}.csv') for _name in _csv_names)
# Quick structural overview: row/column counts, then per-column cardinality
# for each table (same print order and spacing as the original cells).
print(f'The shape of broadband_data is: {broadband_data.shape}\n')
print(f'The shape of outage_data is: {outage_data.shape}\n')
print(f'The shape of report_data is: {report_data.shape}\n')
print(f'The shape of server_data is: {server_data.shape}\n')
print(f'The shape of train_data is: {train_data.shape}')
print(f'The shape of test_data is: {test_data.shape}')
for _frame in (outage_data, train_data, broadband_data,
               server_data, report_data, test_data):
    print(_frame.nunique())
    print('\n')
## Inspect the first rows of every data set (same order as the original cell).
for _frame in (train_data, server_data, report_data,
               broadband_data, outage_data, test_data):
    print(_frame.head())
    print('\n')
## Distribution of the target column in train_data.
train_data['outage_duration'].value_counts()
## Count plot for outage duration.
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
plt.figure(figsize=(14, 6))
# FIX: pass the column by keyword — seaborn >= 0.12 no longer accepts a bare
# positional Series as the plotting variable.
sns.countplot(x=train_data['outage_duration'])
plt.tight_layout()
plt.show()
# Class balance in percent. FIX: value_counts() is computed once instead of
# once per loop iteration as in the original.
counts = train_data['outage_duration'].value_counts()
val = list(counts)
for i in range(len(val)):
    print(counts.index[i], round(val[i] / sum(val) * 100), '%')
# Class legend: 0 = no outage; 1 = short outage (a few minutes up to 2 hours);
# 2 = long outage (2 hours up to a couple of days).
# FIX: in the original export this prose was fused onto the code line,
# which is a syntax error; it is restored here as a comment.
# Scatter of id vs area_code, one trace per outage_duration class.
data = [go.Scatter(x=train_data.loc[train_data.outage_duration == 0, 'area_code'],
                   y=train_data.loc[train_data.outage_duration == 0, 'id'],
                   mode="markers", text=train_data["outage_duration"]),
        go.Scatter(x=train_data.loc[train_data.outage_duration == 1, 'area_code'],
                   y=train_data.loc[train_data.outage_duration == 1, 'id'],
                   mode="markers", text=train_data["outage_duration"]),
        go.Scatter(x=train_data.loc[train_data.outage_duration == 2, 'area_code'],
                   y=train_data.loc[train_data.outage_duration == 2, 'id'],
                   mode="markers", text=train_data["outage_duration"])]
layout = go.Layout(title="ID vs Area Code", xaxis={"title": "area_code"},
                   yaxis=dict(title="ID"), hovermode="closest")
fig = go.Figure(data, layout)
fig.show()
broadband_data.head()
broadband_data.shape
broadband_data['broadband_type'].value_counts()
## Percentage of broadband types.
# FIX: value_counts() is computed once instead of once per loop iteration.
counts = broadband_data['broadband_type'].value_counts()
val = list(counts)
for i in range(len(val)):
    print(counts.index[i], round(val[i] / sum(val) * 100), '%')
# Count plot for broadband type.
# FIX: keyword arg required by seaborn >= 0.12.
plt.figure(figsize=(14, 6))
sns.countplot(x=broadband_data['broadband_type'])
plt.tight_layout()
plt.show()
outage_data.head()
outage_data.shape
outage_data['outage_type'].value_counts()
# Count plot for outage type (the original comment said "Outage Duration",
# but the column plotted is outage_type).
# FIX: keyword arg required by seaborn >= 0.12.
plt.figure(figsize=(14, 6))
sns.countplot(x=outage_data['outage_type'])
plt.tight_layout()
plt.show()
## outage_type distribution in percent.
# FIX: value_counts() is computed once instead of once per loop iteration.
counts = outage_data['outage_type'].value_counts()
val = list(counts)
for i in range(len(val)):
    print(counts.index[i], round(val[i] / sum(val) * 100), '%')
report_data.head()
report_data.shape
report_data.nunique()
log_report_ = report_data['log_report_type'].value_counts()
log_report_.head(10)
# Bar plot of the most frequent log report types.
# BUG FIX: the original called countplot on value_counts(), which counts the
# *count values* themselves rather than showing how often each type occurs.
plt.figure(figsize=(30, 8))
sns.barplot(x=log_report_.head(30).index, y=log_report_.head(30).values)
plt.tight_layout()
plt.show()
server_data.shape
server_data.head()
server_data.nunique().value_counts()
server_count = server_data['transit_server_type'].value_counts()
server_count
# Scatter of id vs transit server type.
# FIX: px.scatter's first positional parameter is the data frame; the
# original passed two Series positionally, misassigning them. Name the
# columns explicitly instead.
import plotly.express as px
fig = px.scatter(server_data, x='id', y='transit_server_type')
fig.show()
server_count.head()
# Count plot for transit server type.
# FIX: keyword arg required by seaborn >= 0.12.
plt.figure(figsize=(32, 12))
sns.countplot(x=server_data['transit_server_type'])
plt.tight_layout()
plt.show()
# Merge each auxiliary table onto the training rows to eyeball how its
# categories relate to the training ids; plot the top-10 of each.
# (on='id' is equivalent to the original left_on=['id'], right_on=['id'].)
train_broadband_merge = pd.merge(train_data, broadband_data, on='id', how='left')
train_broadband_merge.head()
train_broadband_merge.shape
broad_band = train_broadband_merge.broadband_type.value_counts()
broad_band.head(10).plot(kind="bar", figsize=(10, 10))
broad_band.head(10)

train_server_merge = pd.merge(train_data, server_data, on='id', how='left')
train_server_merge.head()
train_server_merge.shape
server_outage = train_server_merge.transit_server_type.value_counts()
server_outage.head(10)
server_outage.head(10).plot(kind="bar", figsize=(10, 10))

train_outage_merge = pd.merge(train_data, outage_data, on='id', how='left')
train_outage_merge.head()
train_outage_merge.shape
outage_duration = train_outage_merge.area_code.value_counts()
outage_duration.head(10)
outage_duration.head(10).plot(kind="bar", figsize=(10, 10))

train_report_merge = pd.merge(train_data, report_data, on='id', how='left')
train_report_merge.head()
train_report_merge.shape
report_outage = train_report_merge.log_report_type.value_counts()
report_outage.head(10)
report_outage.head(10).plot(kind="bar", figsize=(10, 10))
# Tag each row with its origin so train and test can be stacked now and
# separated again after the shared feature engineering.
train_data['train'] = 'yes'
test_data['train'] = 'no'
print(train_data.head())
print('\n')
print(test_data.head())
print('\n')
temp_data = pd.concat([train_data, test_data], ignore_index=True)
temp_data.shape
temp_data.head()
temp_data.tail()
# Keep only the bookkeeping columns, in a fixed order; `data` is the skeleton
# every auxiliary table gets merged against below.
data_columns = ["id", "area_code", "train", "outage_duration"]
data_columns
data = temp_data[data_columns]
data.head()
# ---- Feature engineering on server_data -------------------------------------
# Attach the train/test flag and target to every server record.
server_data=server_data.merge(data,on='id',how='left')
## Checking the top rows of the server data after merging with train and test data
server_data.head()
# Frequency table of transit_server_type (index = type, one count column).
server_data_temp=pd.DataFrame(server_data['transit_server_type'].value_counts())
## checking the top most transit_server_type
server_data_temp.head()
server_data_temp.nunique()
## checking how percentage of transit_server_type involved in the train_data
# Fraction of each server type's rows that come from the training split.
server_data_temp['Perc_Train'] = server_data.pivot_table(values='train',index='transit_server_type',aggfunc=lambda x: sum(x=='yes')/float(len(x)))
server_data_temp.head()
server_data_temp
# Most common outage_duration (mode) per server type, train rows only.
# NOTE(review): `.mode[0]` relies on the scipy.stats.mstats.mode result
# shape of older scipy versions — confirm against the pinned scipy.
server_data_temp['Mode_outage_duration'] = server_data.loc[server_data['train']=='yes'].pivot_table(values='outage_duration',index='transit_server_type', aggfunc=lambda x: mode(x).mode[0])
server_data_temp.iloc[-10:]
server_data_temp.nunique()
# 'preprocess' starts as the raw type name; types beyond the top 30 by
# frequency are collapsed into 'transit_server_other_<mode>' buckets, or
# 'Remove' when their modal duration is missing.
server_data_temp['preprocess'] = server_data_temp.index.values
bottomlimit = 30
# NOTE(review): chained indexing (column, then .iloc assignment) writes via a
# view and normally raises SettingWithCopyWarning — warnings are suppressed
# globally at the top of the file.
server_data_temp['preprocess'].iloc[bottomlimit:] = server_data_temp['Mode_outage_duration'].iloc[bottomlimit:].apply(lambda x: 'Remove' if pd.isnull(x) else 'transit_server_other_%d'%int(x))
server_data_temp['preprocess'].iloc[33:]
print (server_data_temp['preprocess'].value_counts())
print (server_data_temp)
# Map every server row to its (possibly collapsed) preprocess bucket.
server_data = server_data.merge(server_data_temp[['preprocess']], left_on='transit_server_type',right_index=True)
print (server_data.head())
server_data['preprocess'].value_counts()
# Count matrix: rows = id, columns = preprocess bucket, values = number of
# server records of that bucket for the id.
server_data_merge = server_data.pivot_table(values='transit_server_type',index='id',columns='preprocess',aggfunc=lambda x: len(x), fill_value=0)
server_data_merge.shape
server_data_merge.columns
server_data_merge.head()
server_data.shape
# Join the bucket counts back onto the id/area_code/train/target skeleton.
server_data_train=data.merge(server_data_merge,left_on='id',right_index=True)
server_data_train.head()
server_data_train.shape
# Duplicate-id diagnostics on a throwaway copy (nothing below depends on these).
server_temp=server_data.copy()
server_temp=server_temp.drop(['preprocess'],axis=1)
server_temp.shape
server_temp_merge=server_temp.merge(server_data_merge,left_on='id',right_index=True)
server_temp_merge.shape
server_temp_merge.duplicated('id').sum()
server_temp_merge.loc[server_temp_merge.duplicated(subset='id',keep=False),:]
server_data_dup=server_data.merge(server_data_merge,left_on='id',right_index=True)
server_data_dup.head()
server_data_dup.shape
server_data_dup.loc[server_data_dup.duplicated(subset='id',keep=False),:]
# ---- Feature engineering on report_data (same recipe as server_data) --------
data.head()
report_data.head()
report_data.shape
data.shape
# Attach the train/test flag and target to every report record.
report_data=report_data.merge(data,on='id',how='left')
report_data.head()
# Frequency table of log_report_type.
report_data_temp=pd.DataFrame(report_data['log_report_type'].value_counts())
report_data_temp.head()
# Fraction of each log report type's rows coming from the training split.
report_data_temp['Perc_Train'] = report_data.pivot_table(values='train',index='log_report_type',aggfunc=lambda x: sum(x=='yes')/float(len(x)))
report_data_temp.head()
# Modal outage_duration per report type, train rows only.
report_data_temp['Mode_outage_duration']=report_data.loc[report_data['train']=='yes'].pivot_table(values='outage_duration',index='log_report_type',aggfunc=lambda x: mode(x).mode[0])
len(report_data_temp)
report_data_temp['preprocess']=report_data_temp.index.values
# Types occurring *only* in the training split (Perc_Train == 1) carry no
# signal for the test set; blank their bucket so they hit the collapse below.
report_data_temp['preprocess'].loc[report_data_temp['Perc_Train']==1] = np.nan
report_data_temp[-20:]
report_data_temp[:5]
# Collapse everything beyond the 140 most frequent types into
# 'log_report_other<mode>' buckets, or 'Remove' when the mode is missing.
limit=140
report_data_temp['preprocess'].iloc[limit:]=report_data_temp['Mode_outage_duration'].iloc[limit:].apply(lambda x: 'Remove' if pd.isnull(x) else 'log_report_other%d'%int(x))
print (report_data_temp['preprocess'].value_counts())
report_data_temp
# Map every report row to its (possibly collapsed) bucket.
report_data=report_data.merge(report_data_temp[['preprocess']],left_on='log_report_type',right_index=True)
print(report_data.head())
report_data['preprocess'].value_counts()
# Per-id summed report volume per preprocess bucket (unlike server_data,
# which counted rows, this sums the 'volume' column).
report_data_merge = report_data.pivot_table(values='volume',index='id',columns='preprocess',aggfunc=np.sum, fill_value=0)
report_data_merge.shape
report_data_merge.sum().sum()
report_data_merge.head(-20)
# Stack the report features next to the server features.
server_report_train=server_data_train.merge(report_data_merge,left_on='id',right_index=True)
print(server_report_train.shape)
server_report_train.head()
# ---- Feature engineering on outage_data -------------------------------------
outage_data.head()
data.head()
outage_data['outage_type'].value_counts()
# Attach the train/test flag and target to every outage record.
outage_data=outage_data.merge(data,on='id',how='left')
outage_data.head()
outage_data.shape
outage_data_temp = pd.DataFrame(outage_data['outage_type'].value_counts())
outage_data_temp.head()
# Fraction of each outage type's rows coming from the training split.
outage_data_temp['Perc_Train']=outage_data.pivot_table(values='train',index='outage_type',aggfunc=lambda x: sum(x=='yes')/float(len(x)))
outage_data_temp.head()
# Modal outage_duration per outage type, train rows only.
outage_data_temp['Mode_outage_duration']=outage_data.loc[outage_data['train']=='yes'].pivot_table(values='outage_duration',index='outage_type',aggfunc=lambda x: mode(x).mode[0])
outage_data_temp
# BUG FIX: the original filtered outage_data with server_data['train'] — a
# boolean mask built from a *different* frame, selecting the wrong rows via
# index alignment. Use outage_data's own train flag.
outage_data.loc[outage_data['train']=='yes'].pivot_table(values='outage_duration',index='outage_type', aggfunc=lambda x: mode(x))
# Per-id count of outage records per outage type.
outage_data_merge = outage_data.pivot_table(values='train',index='id',columns='outage_type',aggfunc=lambda x: len(x), fill_value=0)
outage_data_merge.head()
# ---- Feature engineering on broadband_data ----------------------------------
broadband_data.head()
broadband_data.shape
broadband_data['broadband_type'].value_counts()
# Attach the train/test flag and target to every broadband record.
broadband_data = broadband_data.merge(data, on='id',how='left')
broadband_data.head()
broadband_data.shape
broadband_data_temp = pd.DataFrame(broadband_data['broadband_type'].value_counts())
broadband_data_temp.head()
# FIX: column was misspelled 'PercT_rain'; renamed to 'Perc_Train' to match
# the server/report/outage temp tables. Nothing downstream read the old name.
broadband_data_temp['Perc_Train'] = broadband_data.pivot_table(values='train',index='broadband_type',aggfunc=lambda x: sum(x=='yes')/float(len(x)))
broadband_data_temp
# Modal outage_duration per broadband type, train rows only.
broadband_data_temp['Mode_outage_duration'] = broadband_data.loc[broadband_data['train']=='yes'].pivot_table(values='outage_duration',index='broadband_type', aggfunc=lambda x: mode(x).mode[0])
broadband_data_temp
broadband_data.loc[broadband_data['broadband_type']=='broadband_type_5']
# Per-id count of broadband records per broadband type.
broadband_data_merge = broadband_data.pivot_table(values='train',index='id',columns='broadband_type',aggfunc=lambda x: len(x), fill_value=0)
broadband_data_merge.head()
# Assemble the full modelling table: server + report + broadband + outage.
server_report_broadband_train=server_report_train.merge(broadband_data_merge,left_on='id',right_index=True)
print(server_report_broadband_train.shape)
server_report_broadband_train.head()
server_report_broadband_outage_train=server_report_broadband_train.merge(outage_data_merge,left_on='id',right_index=True)
server_report_broadband_outage_train.head()
server_report_broadband_outage_train.dtypes
server_report_broadband_outage_train.shape
broadband_data['broadband_type'].value_counts()
report_data['log_report_type'].value_counts()
report_data.head()
# Subsets restricted to long outages (class 2) for the plots below.
train_outage_2=train_data.loc[train_data.outage_duration==2]
server_report_broadband_outage_train.shape
broadband_outage2=broadband_data.loc[broadband_data.outage_duration==2]
broadband_outage2.shape
broadband_outage2.head()
# Plotly offline plumbing for the bar charts below.
import plotly
import plotly.offline as pyoff
# import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
# Switch to the interactive notebook backend (IPython magic).
%matplotlib notebook
# Bar chart: long-outage (class 2) counts per area code.
# NOTE: this rebinding of `data` (here and below) shadows the skeleton frame
# built earlier — preserved from the original, which did the same.
area_counts = train_outage_2.area_code.value_counts()
data = [go.Bar(x=area_counts.index, y=area_counts)]
iplot(data)
# Bar chart: share of each broadband type among long outages.
broadband_temp = broadband_outage2.broadband_type.value_counts()
broadband_temp
share = np.round(broadband_temp.astype(float) / broadband_temp.values.sum(), 2)
data = [go.Bar(x=broadband_temp.index, y=share, text=share, textposition='auto')]
iplot(data)
# Extract the numeric suffix of each broadband type ("broadband_type_<n>")
# and group the types into technology families.
broadband_data['last_digit'] = broadband_data['broadband_type'].apply(lambda x: int(x.split("_")[2]))
broadband_data.head()
# FIX (readability): the original used a five-level nested ternary; a lookup
# table with a 'Fibre' default is equivalent and far clearer.
_technology = {8: 'DSL', 2: 'DSL', 6: 'DSL', 7: 'Cable', 9: 'BPL'}
broadband_data['classification'] = broadband_data['last_digit'].apply(lambda x: _technology.get(x, 'Fibre'))
broadband_data.head()
# Share of each technology family among long outages.
broadband_classified = broadband_data.loc[broadband_data.outage_duration == 2].classification.value_counts()
share = np.round(broadband_classified.astype(float) / broadband_classified.values.sum(), 2)
data = [go.Bar(x=broadband_classified.index, y=share, text=share, textposition='auto')]
iplot(data)
# Share of each broadband type across all outage classes.
broadband_alloutage = broadband_data.broadband_type.value_counts()
share = np.round(broadband_alloutage.astype(float) / broadband_alloutage.values.sum(), 2)
data = [go.Bar(x=broadband_alloutage.index, y=share, text=share, textposition='auto')]
iplot(data)
# Same chart, grouped by technology family instead of raw type.
broadband_alloutage_classified = broadband_data.classification.value_counts()
share = np.round(broadband_alloutage_classified.astype(float) / broadband_alloutage_classified.values.sum(), 2)
data = [go.Bar(x=broadband_alloutage_classified.index, y=share, text=share, textposition='auto')]
iplot(data)
outage_data.head()
# Drop rows with no target before the hue-split count plot.
outage_data_plot = outage_data.dropna()
outage_data_plot.head()
plt.figure(figsize=(10, 5))
sns.countplot(x="outage_type", hue="outage_duration", data=outage_data_plot)
report_data.head()
# Total reported volume per log report type; chart the 30 largest
# (reusing the grouped sum instead of grouping twice).
report_volume = report_data.groupby('log_report_type').volume.sum()
report_largest = report_volume.nlargest(30)
data = [go.Bar(x=report_largest.index, y=report_largest,
               text=report_largest, textposition='auto')]
iplot(data)
server_data.transit_server_type.value_counts()
# Distribution of the collapsed server buckets.
server_plot = server_data.preprocess.value_counts()
data = [go.Bar(x=server_plot.index, y=server_plot,
               text=server_plot, textposition='auto')]
iplot(data)
server_report_broadband_outage_train.shape
server_report_broadband_outage_train.head()
# FIX: drop every 'Remove' bucket column via the computed list instead of the
# hard-coded ['Remove_x', 'Remove_y'] pair, so this keeps working if the merge
# suffixes or the number of Remove columns ever change.
remove_cols = [x for x in server_report_broadband_outage_train.columns if 'Remove' in x]
server_report_broadband_outage_train.drop(remove_cols, axis=1, inplace=True)
server_report_broadband_outage_train.head()
server_report_broadband_outage_train.shape
# Move the target to the last column.
data_new = server_report_broadband_outage_train.copy()
x = data_new.pop('outage_duration')
data_new['outage_duration'] = x
data_new.head()
data_new.shape
data_new.dtypes
data_new.head()
# Integer-encode the categorical area_code.
le = LabelEncoder()
data_new['area_code'] = le.fit_transform(data_new['area_code'])
data_new.head()
# Split back into the original train/test rows and drop the helper flag.
train = data_new.loc[data_new.train == 'yes']
train.tail(5)
train = train.drop('train', axis=1)
train.head()
test = data_new.loc[data_new.train == 'no']
test = test.drop('train', axis=1)
test.head()
# Hold out 30% of the training rows, stratified on the target, with a fixed
# seed for reproducibility.
X, Y = train.loc[:, train.columns != 'outage_duration'], train.loc[:, 'outage_duration']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=123)
from sklearn.metrics import accuracy_score, classification_report

# Baseline 1: logistic regression (test report printed before train report,
# as in the original).
clf = LogisticRegression()
clf = clf.fit(X_train, Y_train)
traine_pred = clf.predict(X_train)
teste_pred = clf.predict(X_test)
print(classification_report(Y_test, teste_pred))
print(classification_report(Y_train, traine_pred))

# Baseline 2: random forest.
clf1 = RandomForestClassifier()
clf1 = clf1.fit(X_train, Y_train)
traine_pred = clf1.predict(X_train)
teste_pred = clf1.predict(X_test)
print(classification_report(Y_train, traine_pred))
print(classification_report(Y_test, teste_pred))

# Baseline 3: decision tree.
clf2 = tree.DecisionTreeClassifier()
clf2 = clf2.fit(X_train, Y_train)
traine_pred_df = clf2.predict(X_train)
teste_pred_df = clf2.predict(X_test)
print(classification_report(Y_train, traine_pred_df))
print('\n')
print('\n')
print(classification_report(Y_test, teste_pred_df))
# Tune the decision tree with a small grid search (5-fold CV, all cores).
param_grid = {
    "criterion": ['entropy'],
    'min_samples_split': [5, 10, 20],
    'max_depth': [2, 5, 10, 15, 30],
    'max_leaf_nodes': [100, 120, 135, 150],
}
dt = tree.DecisionTreeClassifier()
clf_3_cv = GridSearchCV(dt, param_grid, cv=5, n_jobs=-1)
clf_3_cv.fit(X_train, Y_train)
traine_pred = clf_3_cv.predict(X_train)
teste_pred = clf_3_cv.predict(X_test)
clf_3_cv.best_params_
print(classification_report(Y_train, traine_pred))
print('\n')
print('\n')
print(classification_report(Y_test, teste_pred))
# Baseline 4: Gaussian naive Bayes.
clf4 = GaussianNB()
clf4 = clf4.fit(X_train, Y_train)
traine_pred_df = clf4.predict(X_train)
teste_pred_df = clf4.predict(X_test)
# BUG FIX: the original printed reports for traine_pred / teste_pred — the
# grid-searched decision tree's predictions — so the NB scores shown were
# wrong. Report the NB predictions computed just above.
print(classification_report(Y_train, traine_pred_df))
print(classification_report(Y_test, teste_pred_df))

# Baseline 5: AdaBoost.
clf5 = AdaBoostClassifier()
clf5 = clf5.fit(X_train, Y_train)
traine_pred_df = clf5.predict(X_train)
teste_pred_df = clf5.predict(X_test)
# BUG FIX: same stale-variable mistake; report AdaBoost's own predictions.
print(classification_report(Y_train, traine_pred_df))
print(classification_report(Y_test, teste_pred_df))
# Score the held-out test ids with the two final models and write submissions.
test = test.drop('outage_duration', axis=1)
test.head()
## Submission 1: tuned decision tree (grid search CV).
test_pred_1 = clf_3_cv.predict(test)
test_pred_1
final_dt = pd.DataFrame({'id': test['id'], 'outage_duration': test_pred_1})
final_dt.head()
# FIX: the original called reset_index() twice and discarded both results
# (reset_index returns a new frame); assign the dropped-index version once.
final_dt = final_dt.reset_index(drop=True)
# NOTE(review): hard-coded absolute Windows path (with a doubled slash);
# consider a relative path so the notebook runs on other machines.
final_dt.to_csv("C:/Users/admin/Desktop/Mohammed Final Exam//Submission_1_PHD.csv", index=False)
## Submission 2: GaussianNB.
test_pred_2 = clf4.predict(test)
test_pred_2
final_dt2 = pd.DataFrame({'id': test['id'], 'outage_duration': test_pred_2})
final_dt2.to_csv("C:/Users/admin/Desktop/Mohammed Final Exam//Submission_2_PHD.csv", index=False)